package com.asiainfo.weixinc;

import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
import com.asiainfo.souhu.ZhouSouhu;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import okhttp3.Request;
import org.json.JSONObject;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.sound.midi.Soundbank;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.reflect.Array;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;

/**
 * Crawler for articles published by WeChat official accounts.
 *
 * <p>The crawl starts from the article-list JSON endpoint
 * ({@code https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex...}) and then
 * visits every article link found in the response.
 *
 * <p>You must supply your own cookie (see {@link MyRequester}): log in to your
 * own official account; when writing an article there the editor lets you
 * reference articles of other accounts, and that lookup is the request this
 * crawler reuses to obtain the data.
 *
 * @Author jhr
 * @Date 2022/5/6
 */
public class WeixinCrawler extends BreadthCrawler {
    private static final Logger logger = LoggerFactory.getLogger(WeixinCrawler.class);

    /** Berkeley DB folder used by the crawler to keep its URL history. */
    private static String crawlPath = "./weixin";

    /** Not used by this crawler; kept for compatibility. */
    static String RootUrl = "";
    /** List-API URL handed to the most recently constructed crawler. */
    static String Seed = "";
    /** Same value as {@link #Seed}; this is the URL that is actually added as the seed. */
    static String IUrl = "";
    /** List-API URLs crawled by {@link #main(String[])}. */
    static String[] Seeds;
    /** URL pattern passed to the constructor; stored only, no regex rule is registered. */
    static String RegularUrl = "";

    /**
     * Builds a Berkeley-DB based crawler for one article-list URL.
     *
     * <p>The Berkeley DB folder is {@code crawlPath}; it holds the URL history.
     * Different tasks must not share a crawl path, and two crawlers using the
     * same crawl path must not run in parallel.
     *
     * @param Seed       article-list API URL to start from
     * @param regularUrl URL pattern for article pages (stored in {@link #RegularUrl})
     */
    public WeixinCrawler(String Seed, String regularUrl) {
        // autoParse=false: new URLs are added explicitly in visit().
        super(crawlPath, false);
        // The parameter shadows the static field, so qualify the assignment;
        // previously the static Seed was never populated.
        WeixinCrawler.Seed = Seed;
        IUrl = Seed;
        RegularUrl = regularUrl;
        // Request plugin that adds the User-Agent and Cookie headers.
        setRequester(new MyRequester());
        addSeed(new CrawlDatum(IUrl).meta("depth", "2"));
        setThreads(1);
    }

    /**
     * Requester that sends a browser User-Agent and the logged-in session cookie
     * with every request.
     */
    public static class MyRequester extends OkHttpRequester {
        String userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36";
        // NOTE(review): this is a personal session cookie committed to source; it
        // should be replaced with your own and ideally loaded from configuration.
        String cookie = "appmsglist_action_3890764623=card; pgv_pvid=6578469680; fqm_pvqid=2b1bb6f5-4e7c-4a45-b8be-7b2ec872f02c; RK=eGh12bteOC; ptcz=9ecf25a896c448ce9c5b5119f43c575dec62d16e180ea5852d74b7b759abbe47; o_cookie=505171123; pac_uid=1_505171123; tvfe_boss_uuid=982d8263212503aa; _tc_unionid=3440346c-a197-4f5c-aeb6-81d13585e7fb; pt_sms_phone=134******31; ptui_loginuin=505171123@qq.com; ua_id=M3dYVgPHs4PmSwz4AAAAAKrryUPtTYJAZw5-39BKYyI=; wxuin=48804152305337; mm_lang=zh_CN; luin=o0505171123; lskey=000100008cbb023351d13ec1c6b3c6dcba655065927bc230a923418d14913dfad6104dc19dbce0a7ce079de2; iip=0; uuid=ad02289acb932586b75b9fe04c7cb7e9; rand_info=CAESIBuqZ9GLVC51o9Y5ePe4Qm6GLNwMBL7TQigwknuFF4cR; slave_bizuin=3890764623; data_bizuin=3890764623; bizuin=3890764623; data_ticket=2R1SgzOA4TPRmIF4xl3qVABojg7+5UNUVbal+RywDvhbHnKHrusN2g7200RP2mVh; slave_sid=QlRwcnBMNktod1BkYmZUSUQ2ckVWbklRbEkyamJBVVZXY01OaDBwMkJRcE01RUtkV2h1dmJmczFrSHd0RXV0YWpuSWNVU1RFRG11MGxCRzlVNEJBQ0ZwXzNndUVsSDBiZVJmalVacXZTUHJmNVdfWVpSaXhSbDZEODBwOWVtNk40NzJZSVFDZXhYTDhJaUpw; slave_user=gh_81e94a1da400; xid=5b8c4d6a84da73bb566486b007e89674; mmad_session=c9ae15ff79ec58692d184f8e597aa6eae8c9487382e004a1f47c10042bca1f181b0887be70435c898f6e32208a5184d656e09ed4097a1365d77d15d701af06d957db84300da9e289cab1b3234e480ce9af1466dc5f30876deb9f7f436bdc306b11de1c56c245721266e7088080fefde3";

        @Override
        public Request.Builder createRequestBuilder(CrawlDatum crawlDatum) {
            return super.createRequestBuilder(crawlDatum)
                    .addHeader("User-Agent", userAgent)
                    .addHeader("Cookie", cookie);
        }
    }

    // Data of the most recently processed article.
    String title = "";
    String link = "";
    String create_time = "";
    String author = "";
    String content = "";

    /**
     * Handles a fetched page: article pages ({@code https://mp.weixin.qq.com/s...})
     * are scraped and printed, every other page is treated as the JSON article list.
     *
     * @param page fetched page
     * @param next receives the article URLs discovered on a list page
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        if (page.matchUrl("https://mp.weixin.qq.com/s.*")) {
            visitArticle(page);
        } else {
            visitArticleList(page, next);
        }
    }

    /**
     * Parses the JSON article list and queues one task per article link.
     * A body that is not a JSON object still raises the Gson exception, as before.
     */
    private void visitArticleList(Page page, CrawlDatums next) {
        String body = page.html();
        System.out.println("page.html()" + body);

        JsonObject response = new JsonParser().parse(body).getAsJsonObject();
        if (!response.has("app_msg_list") || !response.get("app_msg_list").isJsonArray()) {
            // Previously this case ended in a NullPointerException.
            logger.warn("No app_msg_list array in the response from {}; "
                    + "the cookie or token may no longer be valid", page.url());
            return;
        }

        JsonArray articles = response.getAsJsonArray("app_msg_list");
        for (int i = 0; i < articles.size(); i++) {
            JsonObject article = articles.get(i).getAsJsonObject();
            if (!article.has("title") || !article.has("link") || !article.has("create_time")) {
                logger.warn("Skipping list entry {} of {}: title, link or create_time is missing",
                        i, page.url());
                continue;
            }
            title = article.get("title").getAsString();
            link = article.get("link").getAsString();
            System.out.println(link);

            // Accept both schemes; the original only accepted http:// links.
            if (!link.matches("https?://mp\\.weixin\\.qq\\.com/.*")) {
                System.out.println("正则URL不匹配！！！");
                continue;
            }
            // Upgrade only the scheme, not every "http:" that appears in the URL.
            link = link.replaceFirst("^http:", "https:");
            // create_time is a Unix timestamp in seconds.
            create_time = new Date(article.get("create_time").getAsLong() * 1000).toString();

            // Carry the list data on the task itself: the instance fields are
            // overwritten by every loop iteration, so they cannot be trusted by
            // the time the article page is visited.
            next.add(new CrawlDatum(link)
                    .meta("depth", "2")
                    .meta("refer", page.url())
                    .meta("title", title)
                    .meta("create_time", create_time));
        }
    }

    /** Prints creation time, link, title, author and body text of one article page. */
    private void visitArticle(Page page) {
        String listedTime = page.meta("create_time");
        if (listedTime != null && !listedTime.isEmpty()) {
            create_time = listedTime;
            System.out.println("JSON拿到的create_time:" + create_time);
        }

        if (page.url() != null) {
            link = page.url();
            System.out.println("JSON接口拿到的link:" + link);
        }

        // Prefer the title reported by the list API, fall back to the page heading.
        String listedTitle = page.meta("title");
        if (listedTitle != null && !listedTitle.isEmpty()) {
            title = listedTitle;
        } else {
            title = firstText(page, "h1#activity-name");
        }
        if (!title.isEmpty()) {
            System.out.println("JSON接口拿到的title:" + title);
        }

        String pageAuthor = firstText(page, "a#js_name");
        if (pageAuthor.isEmpty()) {
            // The original fallback selector contained a stray space. It is not
            // visible from here whether the page uses an id or a class, so both
            // are tried. TODO confirm against a real page.
            pageAuthor = firstText(page,
                    "strong#account_nickname_inner, strong.account_nickname_inner");
        }
        author = pageAuthor;
        System.out.println("作者:" + author);

        content = page.select("div#js_content").text().trim();
        System.out.println("正文:" + content);
    }

    /** Returns the text of the first element matching the selector, or "" when nothing matches. */
    private static String firstText(Page page, String cssSelector) {
        // eq(0) yields an empty selection instead of null when there is no match.
        return page.select(cssSelector).eq(0).text();
    }

    /**
     * Crawls the article lists of a fixed set of official accounts, one crawler
     * after the other.
     *
     * <p>{@code fakeid} in each URL identifies the official account; it can be
     * looked up on the WeChat official-account platform.
     */
    public static void main(String[] args) throws Exception {
        Seeds = new String[]{
                // 小债看市: MzI3OTQ1ODUzOQ==
                "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzI3OTQ1ODUzOQ==&type=9&query=&token=1490585033&lang=zh_CN&f=json&ajax=1",
                // 崔永元: MzU1NTY0MDQ5MA==
                "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzU1NTY0MDQ5MA==&type=9&query=&token=1490585033&lang=zh_CN&f=json&ajax=1",
                // 雷斯林 - 为你写一个故事: MzA5MzAyMzE4Nw==, 雷斯林日记: MzA4OTI3Mjc5Mw==
                "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzA5MzAyMzE4Nw==&type=9&query=&token=1490585033&lang=zh_CN&f=json&ajax=1",
                "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzA4OTI3Mjc5Mw==&type=9&query=&token=1490585033&lang=zh_CN&f=json&ajax=1",
                // 高晓松 - 晓松奇谈: MzUxNzk1MDE4NA==
                "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzUxNzk1MDE4NA==&type=9&query=&token=1490585033&lang=zh_CN&f=json&ajax=1",
                // 贺卫方 - 静观天下: MzA3MzI0NzM2MA==
                "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzA3MzI0NzM2MA==&type=9&query=&token=1490585033&lang=zh_CN&f=json&ajax=1",
                // 何兵: MzI0NDM2MTEzMQ==
                "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzI0NDM2MTEzMQ==&type=9&query=&token=1490585033&lang=zh_CN&f=json&ajax=1",
                // 罗永浩: MjM5NzAxNTkzNg==
                "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MjM5NzAxNTkzNg==&type=9&query=&token=1490585033&lang=zh_CN&f=json&ajax=1",
                // 司马南 - 司马南频道: MzAxODIwNTQ1MA==
                "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid=MzAxODIwNTQ1MA==&type=9&query=&token=1490585033&lang=zh_CN&f=json&ajax=1",
        };
        // The crawlers share one crawl path, so they are run strictly one after another.
        for (String seedUrl : Seeds) {
            WeixinCrawler crawler = new WeixinCrawler(seedUrl, "https://mp.weixin.qq.com/.*");
            crawler.setThreads(1);
            crawler.start(3);
            System.out.println(crawler);
        }
    }
}
